In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
In [2]:
df = pd.read_csv('cardata.csv')
df
Out[2]:
| Make | Model | Year | Engine Fuel Type | Engine HP | Engine Cylinders | Transmission Type | Driven_Wheels | Number of Doors | Market Category | Vehicle Size | Vehicle Style | highway MPG | city mpg | Popularity | MSRP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
| 1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
| 2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
| 3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
| 4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11909 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 46120 |
| 11910 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 56670 |
| 11911 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 50620 |
| 11912 | Acura | ZDX | 2013 | premium unleaded (recommended) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 50920 |
| 11913 | Lincoln | Zephyr | 2006 | regular unleaded | 221.0 | 6.0 | AUTOMATIC | front wheel drive | 4.0 | Luxury | Midsize | Sedan | 26 | 17 | 61 | 28995 |
11914 rows × 16 columns
In [3]:
df.columns
Out[3]:
Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
dtype='object')
In [4]:
# make column titles consistent
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()
Out[4]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
| 1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
| 2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
| 3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
| 4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
In [5]:
# getting columns with str type
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings
Out[5]:
['make', 'model', 'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']
In [6]:
# ensuring string data has a standard format
for col in strings:
df[col] = df[col].str.lower().str.replace(' ', '_')
In [7]:
df.head()
Out[7]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
Exploratory data analysis¶
In [8]:
for col in df.columns:
print(col)
print(df[col].unique()[:5]) # first 5 unique values
print(df[col].nunique())
print()
make ['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler'] 48 model ['1_series_m' '1_series' '100' '124_spider' '190-class'] 914 year [2011 2012 2013 1992 1993] 28 engine_fuel_type ['premium_unleaded_(required)' 'regular_unleaded' 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel'] 10 engine_hp [335. 300. 230. 320. 172.] 356 engine_cylinders [ 6. 4. 5. 8. 12.] 9 transmission_type ['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown'] 5 driven_wheels ['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive' 'four_wheel_drive'] 4 number_of_doors [ 2. 4. 3. nan] 3 market_category ['factory_tuner,luxury,high-performance' 'luxury,performance' 'luxury,high-performance' 'luxury' 'performance'] 71 vehicle_size ['compact' 'midsize' 'large'] 3 vehicle_style ['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback'] 16 highway_mpg [26 28 27 25 24] 59 city_mpg [19 20 18 17 16] 69 popularity [3916 3105 819 617 1013] 48 msrp [46135 40650 36350 29450 34500] 6049
Distribution of price¶
In [9]:
sns.histplot(df['msrp'], bins = 50) # bins == no. of bars
Out[9]:
<Axes: xlabel='msrp', ylabel='Count'>
In [10]:
px.histogram(df, x=df['msrp'])
In [11]:
# less expensive cars
sns.histplot(df.msrp[df['msrp'] < 100000], bins = 50)
Out[11]:
<Axes: xlabel='msrp', ylabel='Count'>
In [12]:
# less expensive cars
px.histogram(df, x=df.msrp[df['msrp'] < 100000])
∴ The prices have a long tail distribution as most prices are cheap and few expensive ones
This type of distribution is not good for ML as the tail will confuse our model
Hence we get rid of the tail by applying logarithmic distribution to the price
In [13]:
# example of how a logarithmic distribution behaves
np.log1p([1, 10, 1000, 100000])
Out[13]:
array([ 0.69314718, 2.39789527, 6.90875478, 11.51293546])
In [14]:
price_logs = np.log1p(df['msrp'])
In [15]:
sns.histplot(price_logs)
Out[15]:
<Axes: xlabel='msrp', ylabel='Count'>
When you use np.log1p (the natural logarithm of 1+𝑥), it compresses larger values more than smaller ones. This transformation reduces the gap between extremely high and moderate prices, effectively pulling the long tail back in.
Missing values¶
In [16]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11914 entries, 0 to 11913 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 make 11914 non-null object 1 model 11914 non-null object 2 year 11914 non-null int64 3 engine_fuel_type 11911 non-null object 4 engine_hp 11845 non-null float64 5 engine_cylinders 11884 non-null float64 6 transmission_type 11914 non-null object 7 driven_wheels 11914 non-null object 8 number_of_doors 11908 non-null float64 9 market_category 8172 non-null object 10 vehicle_size 11914 non-null object 11 vehicle_style 11914 non-null object 12 highway_mpg 11914 non-null int64 13 city_mpg 11914 non-null int64 14 popularity 11914 non-null int64 15 msrp 11914 non-null int64 dtypes: float64(3), int64(5), object(8) memory usage: 1.5+ MB
In [17]:
df.isna().sum()
Out[17]:
make 0 model 0 year 0 engine_fuel_type 3 engine_hp 69 engine_cylinders 30 transmission_type 0 driven_wheels 0 number_of_doors 6 market_category 3742 vehicle_size 0 vehicle_style 0 highway_mpg 0 city_mpg 0 popularity 0 msrp 0 dtype: int64
Setting up a validation framework¶
In [18]:
# entire dataset
n = len(df)
n
Out[18]:
11914
In [19]:
# dividing dataset into 20% Validate, 20% Test, 60% Train,
n_val = int(len(df) * 0.2)
print(n_val)
print()
n_test = int(len(df) * 0.2)
print(n_test)
print()
n_train = int(len(df) * 0.6)
print(n_train)
print()
2382 2382 7148
In [20]:
n , n_val + n_test + n_train
Out[20]:
(11914, 11912)
In [21]:
# to ensure all records are used
n_val = int(len(df) * 0.2)
print(n_val)
n_test = int(len(df) * 0.2)
print(n_test)
n_train = n - n_val - n_test
print(n_train)
2382 2382 7150
In [22]:
(n , n_val + n_test + n_train)
Out[22]:
(11914, 11914)
In [23]:
# subdividing the data
df_val = df[:n_val]
df_val
Out[23]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2377 | porsche | cayenne | 2016 | premium_unleaded_(required) | 570.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 21 | 14 | 1715 | 157300 |
| 2378 | porsche | cayenne | 2016 | diesel | 240.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,diesel | midsize | 4dr_suv | 29 | 20 | 1715 | 62300 |
| 2379 | porsche | cayenne | 2017 | premium_unleaded_(required) | 520.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,performance | midsize | 4dr_suv | 21 | 14 | 1715 | 116500 |
| 2380 | porsche | cayenne | 2017 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury | midsize | 4dr_suv | 24 | 18 | 1715 | 59600 |
| 2381 | porsche | cayenne | 2017 | premium_unleaded_(required) | 440.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 23 | 16 | 1715 | 97200 |
2382 rows × 16 columns
In [24]:
df_test = df[n_val:n_val + n_test]
df_test
Out[24]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2382 | porsche | cayenne | 2017 | premium_unleaded_(required) | 570.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 21 | 14 | 1715 | 159600 |
| 2383 | porsche | cayenne | 2017 | premium_unleaded_(required) | 420.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,performance | midsize | 4dr_suv | 24 | 17 | 1715 | 76200 |
| 2384 | porsche | cayman_s | 2006 | premium_unleaded_(required) | 295.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 26 | 18 | 1715 | 58900 |
| 2385 | porsche | cayman | 2014 | premium_unleaded_(required) | 275.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 30 | 20 | 1715 | 52600 |
| 2386 | porsche | cayman | 2014 | premium_unleaded_(required) | 325.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 1715 | 63800 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4759 | ford | flex | 2016 | premium_unleaded_(recommended) | 365.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover | large | wagon | 21 | 15 | 5657 | 42600 |
| 4760 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 32300 |
| 4761 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover | large | wagon | 23 | 16 | 5657 | 29600 |
| 4762 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 34250 |
| 4763 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 39750 |
2382 rows × 16 columns
In [25]:
df_train = df[n_val + n_test:]
df_train
Out[25]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4764 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 37800 |
| 4765 | ford | flex | 2017 | premium_unleaded_(recommended) | 365.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover | large | wagon | 21 | 15 | 5657 | 43030 |
| 4766 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 40180 |
| 4767 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 32730 |
| 4768 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 38230 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11909 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 46120 |
| 11910 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 56670 |
| 11911 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 50620 |
| 11912 | acura | zdx | 2013 | premium_unleaded_(recommended) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 50920 |
| 11913 | lincoln | zephyr | 2006 | regular_unleaded | 221.0 | 6.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | sedan | 26 | 17 | 61 | 28995 |
7150 rows × 16 columns
In [26]:
# shuffling the records
np.arange(n)
Out[26]:
array([ 0, 1, 2, ..., 11911, 11912, 11913])
In [27]:
idx = np.arange(n)
df.iloc[idx[:10]]
Out[27]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
| 5 | bmw | 1_series | 2012 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 31200 |
| 6 | bmw | 1_series | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 26 | 17 | 3916 | 44100 |
| 7 | bmw | 1_series | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 39300 |
| 8 | bmw | 1_series | 2012 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 36900 |
| 9 | bmw | 1_series | 2013 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 27 | 18 | 3916 | 37200 |
In [28]:
np.random.seed(2) # to get the same random numbers every time you run your code
np.random.shuffle(idx)
print(idx)
[2735 6720 5878 ... 6637 2575 7336]
In [29]:
# confirming whether the dataset is shuffled by id
df.iloc[idx[:10]] # first 10 records of shuffled dataset
Out[29]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2735 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 6720 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 5878 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 11190 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4554 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| 8001 | volkswagen | rabbit | 2008 | regular_unleaded | 170.0 | 5.0 | manual | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 29 | 22 | 873 | 17575 |
| 2882 | bentley | continental_gtc | 2013 | premium_unleaded_(required) | 500.0 | 8.0 | automatic | all_wheel_drive | 2.0 | exotic,luxury,high-performance | midsize | convertible | 24 | 14 | 520 | 191400 |
| 649 | bmw | 6_series | 2015 | premium_unleaded_(required) | 315.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | coupe | 32 | 21 | 3916 | 76100 |
| 616 | maybach | 57 | 2012 | premium_unleaded_(required) | 543.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury | large | sedan | 16 | 10 | 67 | 379050 |
| 4459 | ford | f-150_heritage | 2004 | regular_unleaded | 202.0 | 6.0 | manual | four_wheel_drive | 2.0 | NaN | large | regular_cab_pickup | 18 | 13 | 5657 | 26030 |
In [30]:
# indexes of the last 60% of the dataset
idx[n_train:]
Out[30]:
array([2779, 3708, 4794, ..., 6637, 2575, 7336])
In [31]:
df_train = df.iloc[idx[:n_train]] # first 60 % to be for training
df_train
Out[31]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2735 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 6720 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 5878 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 11190 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4554 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 | 54900 |
| 1902 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 | 29215 |
| 9334 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 | 34675 |
| 5284 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 | 303300 |
| 2420 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 | 37820 |
7150 rows × 16 columns
In [32]:
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]
df_val
Out[32]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2779 | chevrolet | colorado | 2015 | regular_unleaded | 200.0 | 4.0 | automatic | four_wheel_drive | 4.0 | NaN | compact | extended_cab_pickup | 25 | 19 | 1385 | 26885 |
| 3708 | mercedes-benz | e-class | 2017 | premium_unleaded_(required) | 241.0 | 4.0 | automatic | all_wheel_drive | 4.0 | luxury | midsize | sedan | 29 | 22 | 617 | 54650 |
| 4794 | ford | focus | 2017 | flex-fuel_(unleaded/e85) | 160.0 | 4.0 | manual | front_wheel_drive | 4.0 | flex_fuel | compact | sedan | 36 | 26 | 5657 | 16775 |
| 10498 | acura | tlx | 2016 | premium_unleaded_(recommended) | 290.0 | 6.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | sedan | 34 | 21 | 204 | 42600 |
| 1880 | volkswagen | beetle_convertible | 2016 | regular_unleaded | 170.0 | 4.0 | automatic | front_wheel_drive | 2.0 | NaN | compact | convertible | 34 | 25 | 873 | 25995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11123 | volvo | v60 | 2015 | regular_unleaded | 240.0 | 4.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | wagon | 37 | 25 | 870 | 35750 |
| 5549 | maserati | granturismo_convertible | 2015 | premium_unleaded_(required) | 444.0 | 8.0 | automatic | rear_wheel_drive | 2.0 | exotic,luxury,high-performance | midsize | convertible | 20 | 13 | 238 | 145740 |
| 4146 | cadillac | escalade_hybrid | 2013 | regular_unleaded | 332.0 | 8.0 | automatic | rear_wheel_drive | 4.0 | luxury,hybrid | large | 4dr_suv | 23 | 20 | 1624 | 74425 |
| 6337 | mitsubishi | lancer | 2016 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 4.0 | NaN | compact | sedan | 34 | 24 | 436 | 17595 |
| 9814 | kia | sorento | 2015 | regular_unleaded | 290.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover | midsize | 4dr_suv | 25 | 18 | 1720 | 26700 |
2382 rows × 16 columns
In [33]:
# resetting the index with new shuffled dataframe
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train
Out[33]:
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 1 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 2 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 3 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7145 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 | 54900 |
| 7146 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 | 29215 |
| 7147 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 | 34675 |
| 7148 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 | 303300 |
| 7149 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 | 37820 |
7150 rows × 16 columns
In [34]:
df_train['msrp'].values
Out[34]:
array([ 14410, 19685, 19795, ..., 34675, 303300, 37820], dtype=int64)
In [35]:
np.log1p(df_train['msrp'].values)
Out[35]:
array([ 9.57574708, 9.887663 , 9.89323518, ..., 10.45380308,
12.62248099, 10.54061978])
In [36]:
y_train = np.log1p(df_train['msrp'].values)
y_val = np.log1p(df_val['msrp'].values)
y_test = np.log1p(df_test['msrp'].values)
In [37]:
# delete msrp variable to avoid accidentally using it
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']
In [38]:
len(y_train)
Out[38]:
7150
In [ ]: